1   package org.apache.lucene.analysis.core;
2   
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one or more
5    * contributor license agreements.  See the NOTICE file distributed with
6    * this work for additional information regarding copyright ownership.
7    * The ASF licenses this file to You under the Apache License, Version 2.0
8    * (the "License"); you may not use this file except in compliance with
9    * the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  
20  import java.io.IOException;
21  import java.io.InputStream;
22  import java.io.Reader;
23  import java.io.StringReader;
24  import java.lang.reflect.Constructor;
25  import java.lang.reflect.InvocationTargetException;
26  import java.lang.reflect.Modifier;
27  import java.net.URI;
28  import java.net.URL;
29  import java.nio.CharBuffer;
30  import java.nio.file.DirectoryStream;
31  import java.nio.file.Files;
32  import java.nio.file.Path;
33  import java.nio.file.Paths;
34  import java.util.ArrayList;
35  import java.util.Arrays;
36  import java.util.Collection;
37  import java.util.Collections;
38  import java.util.Comparator;
39  import java.util.Enumeration;
40  import java.util.HashMap;
41  import java.util.HashSet;
42  import java.util.IdentityHashMap;
43  import java.util.List;
44  import java.util.Map;
45  import java.util.Random;
46  import java.util.Set;
47  import java.util.regex.Pattern;
48  
49  import org.apache.lucene.analysis.Analyzer;
50  import org.apache.lucene.analysis.BaseTokenStreamTestCase;
51  import org.apache.lucene.analysis.CachingTokenFilter;
52  import org.apache.lucene.analysis.CharFilter;
53  import org.apache.lucene.analysis.CrankyTokenFilter;
54  import org.apache.lucene.analysis.MockGraphTokenFilter;
55  import org.apache.lucene.analysis.MockRandomLookaheadTokenFilter;
56  import org.apache.lucene.analysis.MockTokenFilter;
57  import org.apache.lucene.analysis.MockTokenizer;
58  import org.apache.lucene.analysis.TokenFilter;
59  import org.apache.lucene.analysis.TokenStream;
60  import org.apache.lucene.analysis.Tokenizer;
61  import org.apache.lucene.analysis.ValidatingTokenFilter;
62  import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
63  import org.apache.lucene.analysis.cjk.CJKBigramFilter;
64  import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
65  import org.apache.lucene.analysis.commongrams.CommonGramsQueryFilter;
66  import org.apache.lucene.analysis.compound.Lucene43HyphenationCompoundWordTokenFilter;
67  import org.apache.lucene.analysis.compound.TestCompoundWordTokenFilter;
68  import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
69  import org.apache.lucene.analysis.hunspell.Dictionary;
70  import org.apache.lucene.analysis.hunspell.TestHunspellStemFilter;
71  import org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilter;
72  import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter;
73  import org.apache.lucene.analysis.miscellaneous.LimitTokenOffsetFilter;
74  import org.apache.lucene.analysis.miscellaneous.LimitTokenPositionFilter;
75  import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter;
76  import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter.StemmerOverrideMap;
77  import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
78  import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
79  import org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer;
80  import org.apache.lucene.analysis.payloads.IdentityEncoder;
81  import org.apache.lucene.analysis.payloads.PayloadEncoder;
82  import org.apache.lucene.analysis.snowball.TestSnowball;
83  import org.apache.lucene.analysis.standard.StandardTokenizer;
84  import org.apache.lucene.analysis.synonym.SynonymMap;
85  import org.apache.lucene.analysis.util.CharArrayMap;
86  import org.apache.lucene.analysis.util.CharArraySet;
87  import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer;
88  import org.apache.lucene.util.AttributeFactory;
89  import org.apache.lucene.util.AttributeSource;
90  import org.apache.lucene.util.CharsRef;
91  import org.apache.lucene.util.Rethrow;
92  import org.apache.lucene.util.TestUtil;
93  import org.apache.lucene.util.Version;
94  import org.apache.lucene.util.automaton.CharacterRunAutomaton;
95  import org.junit.AfterClass;
96  import org.junit.BeforeClass;
97  import org.tartarus.snowball.SnowballProgram;
98  import org.xml.sax.InputSource;
99  
100 /** tests random analysis chains */
101 public class TestRandomChains extends BaseTokenStreamTestCase {
102 
  // Candidate constructors discovered reflectively in beforeClass() and drawn from
  // at random by MockRandomAnalyzer; sorted for reproducibility, null outside tests.
  static List<Constructor<? extends Tokenizer>> tokenizers;
  static List<Constructor<? extends TokenFilter>> tokenfilters;
  static List<Constructor<? extends CharFilter>> charfilters;
106 
  /** Simple predicate over a value; used to decide whether a constructor-argument
   *  combination is "broken" and must be skipped by the random chains. */
  private static interface Predicate<T> {
    boolean apply(T o);
  }
110 
111   private static final Predicate<Object[]> ALWAYS = new Predicate<Object[]>() {
112     public boolean apply(Object[] args) {
113       return true;
114     };
115   };
116 
117   private static final Map<Constructor<?>,Predicate<Object[]>> brokenConstructors = new HashMap<>();
118   static {
119     try {
120       brokenConstructors.put(
121           LimitTokenCountFilter.class.getConstructor(TokenStream.class, int.class),
122           ALWAYS);
123       brokenConstructors.put(
124           LimitTokenCountFilter.class.getConstructor(TokenStream.class, int.class, boolean.class),
125           new Predicate<Object[]>() {
126             @Override
127             public boolean apply(Object[] args) {
128               assert args.length == 3;
129               return !((Boolean) args[2]); // args are broken if consumeAllTokens is false
130             }
131           });
132       brokenConstructors.put(
133           LimitTokenOffsetFilter.class.getConstructor(TokenStream.class, int.class),
134           ALWAYS);
135       brokenConstructors.put(
136           LimitTokenOffsetFilter.class.getConstructor(TokenStream.class, int.class, boolean.class),
137           new Predicate<Object[]>() {
138             @Override
139             public boolean apply(Object[] args) {
140               assert args.length == 3;
141               return !((Boolean) args[2]); // args are broken if consumeAllTokens is false
142             }
143           });
144       brokenConstructors.put(
145           LimitTokenPositionFilter.class.getConstructor(TokenStream.class, int.class),
146           ALWAYS);
147       brokenConstructors.put(
148           LimitTokenPositionFilter.class.getConstructor(TokenStream.class, int.class, boolean.class),
149           new Predicate<Object[]>() {
150             @Override
151             public boolean apply(Object[] args) {
152               assert args.length == 3;
153               return !((Boolean) args[2]); // args are broken if consumeAllTokens is false
154             }
155           });
156       for (Class<?> c : Arrays.<Class<?>>asList(
157           // TODO: can we promote some of these to be only
158           // offsets offenders?
159           // doesn't actual reset itself!
160           CachingTokenFilter.class,
161           // Not broken, simulates brokenness:
162           CrankyTokenFilter.class,
163           // Not broken: we forcefully add this, so we shouldn't
164           // also randomly pick it:
165           ValidatingTokenFilter.class, 
166           // TODO: needs to be a tokenizer, doesnt handle graph inputs properly (a shingle or similar following will then cause pain)
167           WordDelimiterFilter.class)) {
168         for (Constructor<?> ctor : c.getConstructors()) {
169           brokenConstructors.put(ctor, ALWAYS);
170         }
171       }  
172     } catch (Exception e) {
173       throw new Error(e);
174     }
175   }
176 
177   // TODO: also fix these and remove (maybe):
178   // Classes/options that don't produce consistent graph offsets:
179   private static final Map<Constructor<?>,Predicate<Object[]>> brokenOffsetsConstructors = new HashMap<>();
180   static {
181     try {
182       for (Class<?> c : Arrays.<Class<?>>asList(
183           ReversePathHierarchyTokenizer.class,
184           PathHierarchyTokenizer.class,
185           // TODO: it seems to mess up offsets!?
186           WikipediaTokenizer.class,
187           // TODO: doesn't handle graph inputs
188           CJKBigramFilter.class,
189           // TODO: doesn't handle graph inputs (or even look at positionIncrement)
190           HyphenatedWordsFilter.class,
191           // TODO: LUCENE-4983
192           CommonGramsFilter.class,
193           // TODO: doesn't handle graph inputs
194           CommonGramsQueryFilter.class)) {
195         for (Constructor<?> ctor : c.getConstructors()) {
196           brokenOffsetsConstructors.put(ctor, ALWAYS);
197         }
198       }
199     } catch (Exception e) {
200       throw new Error(e);
201     }
202   }
203 
204   @BeforeClass
205   public static void beforeClass() throws Exception {
206     List<Class<?>> analysisClasses = getClassesForPackage("org.apache.lucene.analysis");
207     tokenizers = new ArrayList<>();
208     tokenfilters = new ArrayList<>();
209     charfilters = new ArrayList<>();
210     for (final Class<?> c : analysisClasses) {
211       final int modifiers = c.getModifiers();
212       if (
213         // don't waste time with abstract classes or deprecated known-buggy ones
214         Modifier.isAbstract(modifiers) || !Modifier.isPublic(modifiers)
215         || c.isSynthetic() || c.isAnonymousClass() || c.isMemberClass() || c.isInterface()
216         || c.isAnnotationPresent(Deprecated.class)
217         || !(Tokenizer.class.isAssignableFrom(c) || TokenFilter.class.isAssignableFrom(c) || CharFilter.class.isAssignableFrom(c))
218       ) {
219         continue;
220       }
221       
222       for (final Constructor<?> ctor : c.getConstructors()) {
223         // don't test synthetic or deprecated ctors, they likely have known bugs:
224         if (ctor.isSynthetic() || ctor.isAnnotationPresent(Deprecated.class) || brokenConstructors.get(ctor) == ALWAYS) {
225           continue;
226         }
227         if (Tokenizer.class.isAssignableFrom(c)) {
228           assertTrue(ctor.toGenericString() + " has unsupported parameter types",
229             allowedTokenizerArgs.containsAll(Arrays.asList(ctor.getParameterTypes())));
230           tokenizers.add(castConstructor(Tokenizer.class, ctor));
231         } else if (TokenFilter.class.isAssignableFrom(c)) {
232           assertTrue(ctor.toGenericString() + " has unsupported parameter types",
233             allowedTokenFilterArgs.containsAll(Arrays.asList(ctor.getParameterTypes())));
234           tokenfilters.add(castConstructor(TokenFilter.class, ctor));
235         } else if (CharFilter.class.isAssignableFrom(c)) {
236           assertTrue(ctor.toGenericString() + " has unsupported parameter types",
237             allowedCharFilterArgs.containsAll(Arrays.asList(ctor.getParameterTypes())));
238           charfilters.add(castConstructor(CharFilter.class, ctor));
239         } else {
240           fail("Cannot get here");
241         }
242       }
243     }
244     
245     final Comparator<Constructor<?>> ctorComp = new Comparator<Constructor<?>>() {
246       @Override
247       public int compare(Constructor<?> arg0, Constructor<?> arg1) {
248         return arg0.toGenericString().compareTo(arg1.toGenericString());
249       }
250     };
251     Collections.sort(tokenizers, ctorComp);
252     Collections.sort(tokenfilters, ctorComp);
253     Collections.sort(charfilters, ctorComp);
254     if (VERBOSE) {
255       System.out.println("tokenizers = " + tokenizers);
256       System.out.println("tokenfilters = " + tokenfilters);
257       System.out.println("charfilters = " + charfilters);
258     }
259   }
260   
261   @AfterClass
262   public static void afterClass() {
263     tokenizers = null;
264     tokenfilters = null;
265     charfilters = null;
266   }
267   
  /** Hack to work around the stupidness of Oracle's strict Java backwards compatibility.
   * {@code Class<T>#getConstructors()} should return unmodifiable {@code List<Constructor<T>>} not array!
   * The cast is safe: callers only pass ctors whose declaring class is assignable to {@code instanceClazz}. */
  @SuppressWarnings("unchecked") 
  private static <T> Constructor<T> castConstructor(Class<T> instanceClazz, Constructor<?> ctor) {
    return (Constructor<T>) ctor;
  }
274   
275   public static List<Class<?>> getClassesForPackage(String pckgname) throws Exception {
276     final List<Class<?>> classes = new ArrayList<>();
277     collectClassesForPackage(pckgname, classes);
278     assertFalse("No classes found in package '"+pckgname+"'; maybe your test classes are packaged as JAR file?", classes.isEmpty());
279     return classes;
280   }
281   
282   private static void collectClassesForPackage(String pckgname, List<Class<?>> classes) throws Exception {
283     final ClassLoader cld = TestRandomChains.class.getClassLoader();
284     final String path = pckgname.replace('.', '/');
285     final Enumeration<URL> resources = cld.getResources(path);
286     while (resources.hasMoreElements()) {
287       final URI uri = resources.nextElement().toURI();
288       if (!"file".equalsIgnoreCase(uri.getScheme()))
289         continue;
290       final Path directory = Paths.get(uri);
291       if (Files.exists(directory)) {
292         try (DirectoryStream<Path> stream = Files.newDirectoryStream(directory)) {
293           for (Path file : stream) {
294             if (Files.isDirectory(file)) {
295               // recurse
296               String subPackage = pckgname + "." + file.getFileName().toString();
297               collectClassesForPackage(subPackage, classes);
298             }
299             String fname = file.getFileName().toString();
300             if (fname.endsWith(".class")) {
301               String clazzName = fname.substring(0, fname.length() - 6);
302               // exclude Test classes that happen to be in these packages.
303               // class.ForName'ing some of them can cause trouble.
304               if (!clazzName.endsWith("Test") && !clazzName.startsWith("Test")) {
305                 // Don't run static initializers, as we won't use most of them.
306                 // Java will do that automatically once accessed/instantiated.
307                 classes.add(Class.forName(pckgname + '.' + clazzName, false, cld));
308               }
309             }
310           }
311         }
312       }
313     }
314   }
315   
  /** Factory for a random constructor argument of one particular type. */
  private static interface ArgProducer {
    Object create(Random random);
  }
319   
  /** Per-type factories for random constructor arguments; keyed by identity since
   *  Class tokens are singletons. Populated via double-brace initialization. */
  private static final Map<Class<?>,ArgProducer> argProducers = new IdentityHashMap<Class<?>,ArgProducer>() {{
    put(int.class, new ArgProducer() {
      @Override public Object create(Random random) {
        // TODO: could cause huge ram usage to use full int range for some filters
        // (e.g. allocate enormous arrays)
        // return Integer.valueOf(random.nextInt());
        return Integer.valueOf(TestUtil.nextInt(random, -50, 50));
      }
    });
    put(char.class, new ArgProducer() {
      @Override public Object create(Random random) {
        // TODO: fix any filters that care to throw IAE instead.
        // also add a unicode validating filter to validate termAtt?
        // return Character.valueOf((char)random.nextInt(65536));
        while(true) {
          char c = (char)random.nextInt(65536);
          // skip surrogate code units (U+D800..U+DFFF): not valid standalone chars
          if (c < '\uD800' || c > '\uDFFF') {
            return Character.valueOf(c);
          }
        }
      }
    });
    put(float.class, new ArgProducer() {
      @Override public Object create(Random random) {
        return Float.valueOf(random.nextFloat());
      }
    });
    put(boolean.class, new ArgProducer() {
      @Override public Object create(Random random) {
        return Boolean.valueOf(random.nextBoolean());
      }
    });
    put(byte.class, new ArgProducer() {
      @Override public Object create(Random random) {
        // this wraps to negative when casting to byte
        return Byte.valueOf((byte) random.nextInt(256));
      }
    });
    put(byte[].class, new ArgProducer() {
      @Override public Object create(Random random) {
        byte bytes[] = new byte[random.nextInt(256)];
        random.nextBytes(bytes);
        return bytes;
      }
    });
    put(Random.class, new ArgProducer() {
      @Override public Object create(Random random) {
        // derive a child Random so the result is reproducible from the chain seed
        return new Random(random.nextLong());
      }
    });
    put(Version.class, new ArgProducer() {
      @Override public Object create(Random random) {
        // we expect bugs in emulating old versions
        return Version.LATEST;
      }
    });
    put(AttributeFactory.class, new ArgProducer() {
      @Override public Object create(Random random) {
        return newAttributeFactory(random);
      }
    });
    put(Set.class, new ArgProducer() {
      @Override public Object create(Random random) {
        // TypeTokenFilter
        Set<String> set = new HashSet<>();
        int num = random.nextInt(5);
        for (int i = 0; i < num; i++) {
          set.add(StandardTokenizer.TOKEN_TYPES[random.nextInt(StandardTokenizer.TOKEN_TYPES.length)]);
        }
        return set;
      }
    });
    put(Collection.class, new ArgProducer() {
      @Override public Object create(Random random) {
        // CapitalizationFilter
        Collection<char[]> col = new ArrayList<>();
        int num = random.nextInt(5);
        for (int i = 0; i < num; i++) {
          col.add(TestUtil.randomSimpleString(random).toCharArray());
        }
        return col;
      }
    });
    put(CharArraySet.class, new ArgProducer() {
      @Override public Object create(Random random) {
        int num = random.nextInt(10);
        CharArraySet set = new CharArraySet(num, random.nextBoolean());
        for (int i = 0; i < num; i++) {
          // TODO: make nastier
          set.add(TestUtil.randomSimpleString(random));
        }
        return set;
      }
    });
    put(Pattern.class, new ArgProducer() {
      @Override public Object create(Random random) {
        // TODO: don't want to make the exponentially slow ones Dawid documents
        // in TestPatternReplaceFilter, so dont use truly random patterns (for now)
        return Pattern.compile("a");
      }
    });
    
    put(Pattern[].class, new ArgProducer() {
      @Override public Object create(Random random) {
        return new Pattern[] {Pattern.compile("([a-z]+)"), Pattern.compile("([0-9]+)")};
      }
    });
    put(PayloadEncoder.class, new ArgProducer() {
      @Override public Object create(Random random) {
        return new IdentityEncoder(); // the other encoders will throw exceptions if tokens arent numbers?
      }
    });
    put(Dictionary.class, new ArgProducer() {
      @Override public Object create(Random random) {
        // TODO: make nastier
        // tiny Hunspell dictionary shipped with the hunspell test resources
        InputStream affixStream = TestHunspellStemFilter.class.getResourceAsStream("simple.aff");
        InputStream dictStream = TestHunspellStemFilter.class.getResourceAsStream("simple.dic");
        try {
         return new Dictionary(affixStream, dictStream);
        } catch (Exception ex) {
          Rethrow.rethrow(ex);
          return null; // unreachable code
        }
      }
    });
    put(HyphenationTree.class, new ArgProducer() {
      @Override public Object create(Random random) {
        // TODO: make nastier
        try {
          InputSource is = new InputSource(TestCompoundWordTokenFilter.class.getResource("da_UTF8.xml").toExternalForm());
          HyphenationTree hyphenator = Lucene43HyphenationCompoundWordTokenFilter.getHyphenationTree(is);
          return hyphenator;
        } catch (Exception ex) {
          Rethrow.rethrow(ex);
          return null; // unreachable code
        }
      }
    });
    put(SnowballProgram.class, new ArgProducer() {
      @Override public Object create(Random random) {
        // pick a random snowball language stemmer by reflective class lookup
        try {
          String lang = TestSnowball.SNOWBALL_LANGS[random.nextInt(TestSnowball.SNOWBALL_LANGS.length)];
          Class<? extends SnowballProgram> clazz = Class.forName("org.tartarus.snowball.ext." + lang + "Stemmer").asSubclass(SnowballProgram.class);
          return clazz.newInstance();
        } catch (Exception ex) {
          Rethrow.rethrow(ex);
          return null; // unreachable code
        }
      }
    });
    put(String.class, new ArgProducer() {
      @Override public Object create(Random random) {
        // TODO: make nastier
        if (random.nextBoolean()) {
          // a token type
          return StandardTokenizer.TOKEN_TYPES[random.nextInt(StandardTokenizer.TOKEN_TYPES.length)];
        } else {
          return TestUtil.randomSimpleString(random);
        }
      }
    });
    put(NormalizeCharMap.class, new ArgProducer() {
      @Override public Object create(Random random) {
        NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
        // we can't add duplicate keys, or NormalizeCharMap gets angry
        Set<String> keys = new HashSet<>();
        int num = random.nextInt(5);
        //System.out.println("NormalizeCharMap=");
        for (int i = 0; i < num; i++) {
          String key = TestUtil.randomSimpleString(random);
          // empty keys are also rejected by the builder, skip them
          if (!keys.contains(key) && key.length() > 0) {
            String value = TestUtil.randomSimpleString(random);
            builder.add(key, value);
            keys.add(key);
            //System.out.println("mapping: '" + key + "' => '" + value + "'");
          }
        }
        return builder.build();
      }
    });
    put(CharacterRunAutomaton.class, new ArgProducer() {
      @Override public Object create(Random random) {
        // TODO: could probably use a purely random automaton
        switch(random.nextInt(5)) {
          case 0: return MockTokenizer.KEYWORD;
          case 1: return MockTokenizer.SIMPLE;
          case 2: return MockTokenizer.WHITESPACE;
          case 3: return MockTokenFilter.EMPTY_STOPSET;
          default: return MockTokenFilter.ENGLISH_STOPSET;
        }
      }
    });
    put(CharArrayMap.class, new ArgProducer() {
      @Override public Object create(Random random) {
        int num = random.nextInt(10);
        CharArrayMap<String> map = new CharArrayMap<>(num, random.nextBoolean());
        for (int i = 0; i < num; i++) {
          // TODO: make nastier
          map.put(TestUtil.randomSimpleString(random), TestUtil.randomSimpleString(random));
        }
        return map;
      }
    });
523     put(StemmerOverrideMap.class, new ArgProducer() {
524       @Override public Object create(Random random) {
525         int num = random.nextInt(10);
526         StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(random.nextBoolean());
527         for (int i = 0; i < num; i++) {
528           String input = ""; 
529           do {
530             input = TestUtil.randomRealisticUnicodeString(random);
531           } while(input.isEmpty());
532           String out = ""; TestUtil.randomSimpleString(random);
533           do {
534             out = TestUtil.randomRealisticUnicodeString(random);
535           } while(out.isEmpty());
536           builder.add(input, out);
537         }
538         try {
539           return builder.build();
540         } catch (Exception ex) {
541           Rethrow.rethrow(ex);
542           return null; // unreachable code
543         }
544       }
545     });
    put(SynonymMap.class, new ArgProducer() {
      @Override public Object create(Random random) {
        SynonymMap.Builder b = new SynonymMap.Builder(random.nextBoolean());
        final int numEntries = atLeast(10);
        for (int j = 0; j < numEntries; j++) {
          addSyn(b, randomNonEmptyString(random), randomNonEmptyString(random), random.nextBoolean());
        }
        try {
          return b.build();
        } catch (Exception ex) {
          Rethrow.rethrow(ex);
          return null; // unreachable code
        }
      }
      
      // joins multi-word rules with \u0000, the word separator SynonymMap.Builder expects
      private void addSyn(SynonymMap.Builder b, String input, String output, boolean keepOrig) {
        b.add(new CharsRef(input.replaceAll(" +", "\u0000")),
              new CharsRef(output.replaceAll(" +", "\u0000")),
              keepOrig);
      }
      
      // redraws until the string is non-empty and free of \u0000 (reserved as separator above)
      private String randomNonEmptyString(Random random) {
        while(true) {
          final String s = TestUtil.randomUnicodeString(random).trim();
          if (s.length() != 0 && s.indexOf('\u0000') == -1) {
            return s;
          }
        }
      }    
    });
  }};
577   
578   static final Set<Class<?>> allowedTokenizerArgs, allowedTokenFilterArgs, allowedCharFilterArgs;
579   static {
580     allowedTokenizerArgs = Collections.newSetFromMap(new IdentityHashMap<Class<?>,Boolean>());
581     allowedTokenizerArgs.addAll(argProducers.keySet());
582     allowedTokenizerArgs.add(Reader.class);
583     allowedTokenizerArgs.add(AttributeFactory.class);
584     allowedTokenizerArgs.add(AttributeSource.class);
585     
586     allowedTokenFilterArgs = Collections.newSetFromMap(new IdentityHashMap<Class<?>,Boolean>());
587     allowedTokenFilterArgs.addAll(argProducers.keySet());
588     allowedTokenFilterArgs.add(TokenStream.class);
589     // TODO: fix this one, thats broken:
590     allowedTokenFilterArgs.add(CommonGramsFilter.class);
591     
592     allowedCharFilterArgs = Collections.newSetFromMap(new IdentityHashMap<Class<?>,Boolean>());
593     allowedCharFilterArgs.addAll(argProducers.keySet());
594     allowedCharFilterArgs.add(Reader.class);
595   }
596   
  /** Creates a random value of {@code paramType} via the registered ArgProducer;
   *  fails the test if no producer is registered for that type. */
  @SuppressWarnings("unchecked")
  static <T> T newRandomArg(Random random, Class<T> paramType) {
    final ArgProducer producer = argProducers.get(paramType);
    assertNotNull("No producer for arguments of type " + paramType.getName() + " found", producer);
    return (T) producer.create(random);
  }
603   
604   static Object[] newTokenizerArgs(Random random, Class<?>[] paramTypes) {
605     Object[] args = new Object[paramTypes.length];
606     for (int i = 0; i < args.length; i++) {
607       Class<?> paramType = paramTypes[i];
608       if (paramType == AttributeSource.class) {
609         // TODO: args[i] = new AttributeSource();
610         // this is currently too scary to deal with!
611         args[i] = null; // force IAE
612       } else {
613         args[i] = newRandomArg(random, paramType);
614       }
615     }
616     return args;
617   }
618   
619   static Object[] newCharFilterArgs(Random random, Reader reader, Class<?>[] paramTypes) {
620     Object[] args = new Object[paramTypes.length];
621     for (int i = 0; i < args.length; i++) {
622       Class<?> paramType = paramTypes[i];
623       if (paramType == Reader.class) {
624         args[i] = reader;
625       } else {
626         args[i] = newRandomArg(random, paramType);
627       }
628     }
629     return args;
630   }
631   
632   static Object[] newFilterArgs(Random random, TokenStream stream, Class<?>[] paramTypes) {
633     Object[] args = new Object[paramTypes.length];
634     for (int i = 0; i < args.length; i++) {
635       Class<?> paramType = paramTypes[i];
636       if (paramType == TokenStream.class) {
637         args[i] = stream;
638       } else if (paramType == CommonGramsFilter.class) {
639         // TODO: fix this one, thats broken: CommonGramsQueryFilter takes this one explicitly
640         args[i] = new CommonGramsFilter(stream, newRandomArg(random, CharArraySet.class));
641       } else {
642         args[i] = newRandomArg(random, paramType);
643       }
644     }
645     return args;
646   }
647 
648   static class MockRandomAnalyzer extends Analyzer {
    // the seed fully determines the chain: every method re-derives the same
    // random sequence from it, so tokenizer/filters/charfilters stay consistent
    final long seed;
    
    MockRandomAnalyzer(long seed) {
      this.seed = seed;
    }
654 
655     public boolean offsetsAreCorrect() {
656       // TODO: can we not do the full chain here!?
657       Random random = new Random(seed);
658       TokenizerSpec tokenizerSpec = newTokenizer(random);
659       TokenFilterSpec filterSpec = newFilterChain(random, tokenizerSpec.tokenizer, tokenizerSpec.offsetsAreCorrect);
660       return filterSpec.offsetsAreCorrect;
661     }
662     
663     @Override
664     protected TokenStreamComponents createComponents(String fieldName) {
665       Random random = new Random(seed);
666       TokenizerSpec tokenizerSpec = newTokenizer(random);
667       //System.out.println("seed=" + seed + ",create tokenizer=" + tokenizerSpec.toString);
668       TokenFilterSpec filterSpec = newFilterChain(random, tokenizerSpec.tokenizer, tokenizerSpec.offsetsAreCorrect);
669       //System.out.println("seed=" + seed + ",create filter=" + filterSpec.toString);
670       return new TokenStreamComponents(tokenizerSpec.tokenizer, filterSpec.stream);
671     }
672 
673     @Override
674     protected Reader initReader(String fieldName, Reader reader) {
675       Random random = new Random(seed);
676       CharFilterSpec charfilterspec = newCharFilterChain(random, reader);
677       return charfilterspec.reader;
678     }
679 
680     @Override
681     public String toString() {
682       Random random = new Random(seed);
683       StringBuilder sb = new StringBuilder();
684       CharFilterSpec charFilterSpec = newCharFilterChain(random, new StringReader(""));
685       sb.append("\ncharfilters=");
686       sb.append(charFilterSpec.toString);
687       // intentional: initReader gets its own separate random
688       random = new Random(seed);
689       TokenizerSpec tokenizerSpec = newTokenizer(random);
690       sb.append("\n");
691       sb.append("tokenizer=");
692       sb.append(tokenizerSpec.toString);
693       TokenFilterSpec tokenFilterSpec = newFilterChain(random, tokenizerSpec.tokenizer, tokenizerSpec.offsetsAreCorrect);
694       sb.append("\n");
695       sb.append("filters=");
696       sb.append(tokenFilterSpec.toString);
697       sb.append("\n");
698       sb.append("offsetsAreCorrect=" + tokenFilterSpec.offsetsAreCorrect);
699       return sb.toString();
700     }
701     
702     private <T> T createComponent(Constructor<T> ctor, Object[] args, StringBuilder descr) {
703       try {
704         final T instance = ctor.newInstance(args);
705         /*
706         if (descr.length() > 0) {
707           descr.append(",");
708         }
709         */
710         descr.append("\n  ");
711         descr.append(ctor.getDeclaringClass().getName());
712         String params = Arrays.deepToString(args);
713         params = params.substring(1, params.length()-1);
714         descr.append("(").append(params).append(")");
715         return instance;
716       } catch (InvocationTargetException ite) {
717         final Throwable cause = ite.getCause();
718         if (cause instanceof IllegalArgumentException ||
719             cause instanceof UnsupportedOperationException) {
720           // thats ok, ignore
721           if (VERBOSE) {
722             System.err.println("Ignoring IAE/UOE from ctor:");
723             cause.printStackTrace(System.err);
724           }
725         } else {
726           Rethrow.rethrow(cause);
727         }
728       } catch (IllegalAccessException | InstantiationException iae) {
729         Rethrow.rethrow(iae);
730       }
731       return null; // no success
732     }
733 
734     private boolean broken(Constructor<?> ctor, Object[] args) {
735       final Predicate<Object[]> pred = brokenConstructors.get(ctor);
736       return pred != null && pred.apply(args);
737     }
738 
739     private boolean brokenOffsets(Constructor<?> ctor, Object[] args) {
740       final Predicate<Object[]> pred = brokenOffsetsConstructors.get(ctor);
741       return pred != null && pred.apply(args);
742     }
743 
744     // create a new random tokenizer from classpath
745     private TokenizerSpec newTokenizer(Random random) {
746       TokenizerSpec spec = new TokenizerSpec();
747       while (spec.tokenizer == null) {
748         final Constructor<? extends Tokenizer> ctor = tokenizers.get(random.nextInt(tokenizers.size()));
749         final StringBuilder descr = new StringBuilder();
750         final Object args[] = newTokenizerArgs(random, ctor.getParameterTypes());
751         if (broken(ctor, args)) {
752           continue;
753         }
754         spec.tokenizer = createComponent(ctor, args, descr);
755         if (spec.tokenizer != null) {
756           spec.offsetsAreCorrect &= !brokenOffsets(ctor, args);
757           spec.toString = descr.toString();
758         }
759       }
760       return spec;
761     }
762     
763     private CharFilterSpec newCharFilterChain(Random random, Reader reader) {
764       CharFilterSpec spec = new CharFilterSpec();
765       spec.reader = reader;
766       StringBuilder descr = new StringBuilder();
767       int numFilters = random.nextInt(3);
768       for (int i = 0; i < numFilters; i++) {
769         while (true) {
770           final Constructor<? extends CharFilter> ctor = charfilters.get(random.nextInt(charfilters.size()));
771           final Object args[] = newCharFilterArgs(random, spec.reader, ctor.getParameterTypes());
772           if (broken(ctor, args)) {
773             continue;
774           }
775           reader = createComponent(ctor, args, descr);
776           if (reader != null) {
777             spec.reader = reader;
778             break;
779           }
780         }
781       }
782       spec.toString = descr.toString();
783       return spec;
784     }
785     
    /**
     * Builds a chain of 0-4 randomly chosen token filters on top of the given
     * tokenizer, wrapping every stage in a ValidatingTokenFilter so that a
     * violation is detected right after the filter that introduced it.
     * {@code offsetsAreCorrect} is cleared when a known offset-breaking filter
     * joins the chain, and the flag is threaded into the validators.
     */
    private TokenFilterSpec newFilterChain(Random random, Tokenizer tokenizer, boolean offsetsAreCorrect) {
      TokenFilterSpec spec = new TokenFilterSpec();
      spec.offsetsAreCorrect = offsetsAreCorrect;
      spec.stream = tokenizer;
      StringBuilder descr = new StringBuilder();
      int numFilters = random.nextInt(5);
      for (int i = 0; i < numFilters; i++) {

        // Insert ValidatingTF after each stage so we can
        // catch problems right after the TF that "caused"
        // them:
        spec.stream = new ValidatingTokenFilter(spec.stream, "stage " + i, spec.offsetsAreCorrect);

        // keep drawing random filter constructors until one instantiates successfully
        while (true) {
          final Constructor<? extends TokenFilter> ctor = tokenfilters.get(random.nextInt(tokenfilters.size()));
          
          // hack: MockGraph/MockLookahead has assertions that will trip if they follow
          // an offsets violator. so we cant use them after e.g. wikipediatokenizer
          if (!spec.offsetsAreCorrect &&
              (ctor.getDeclaringClass().equals(MockGraphTokenFilter.class)
               || ctor.getDeclaringClass().equals(MockRandomLookaheadTokenFilter.class))) {
            continue;
          }
          
          final Object args[] = newFilterArgs(random, spec.stream, ctor.getParameterTypes());
          if (broken(ctor, args)) {
            // known-bad constructor + argument combination: draw again
            continue;
          }
          final TokenFilter flt = createComponent(ctor, args, descr);
          if (flt != null) {
            spec.offsetsAreCorrect &= !brokenOffsets(ctor, args);
            spec.stream = flt;
            break;
          }
        }
      }

      // Insert ValidatingTF after each stage so we can
      // catch problems right after the TF that "caused"
      // them:
      spec.stream = new ValidatingTokenFilter(spec.stream, "last stage", spec.offsetsAreCorrect);

      spec.toString = descr.toString();
      return spec;
    }
831   }
832   
  /**
   * CharFilter wrapper that records whether any characters were ever consumed
   * from the underlying reader. Every character-consuming operation (the read
   * variants and skip) sets {@link #readSomething}; mark/reset/ready and
   * markSupported delegate without touching the flag since they consume
   * nothing. Offsets are passed through unchanged.
   */
  static class CheckThatYouDidntReadAnythingReaderWrapper extends CharFilter {
    // set to true the first time any read/skip call reaches the wrapped reader
    boolean readSomething;
    
    CheckThatYouDidntReadAnythingReaderWrapper(Reader in) {
      super(in);
    }
    
    @Override
    public int correct(int currentOff) {
      return currentOff; // we don't change any offsets
    }

    @Override
    public int read(char[] cbuf, int off, int len) throws IOException {
      readSomething = true;
      return input.read(cbuf, off, len);
    }

    @Override
    public int read() throws IOException {
      readSomething = true;
      return input.read();
    }

    @Override
    public int read(CharBuffer target) throws IOException {
      readSomething = true;
      return input.read(target);
    }

    @Override
    public int read(char[] cbuf) throws IOException {
      readSomething = true;
      return input.read(cbuf);
    }

    @Override
    public long skip(long n) throws IOException {
      readSomething = true;
      return input.skip(n);
    }

    @Override
    public void mark(int readAheadLimit) throws IOException {
      input.mark(readAheadLimit);
    }

    @Override
    public boolean markSupported() {
      return input.markSupported();
    }

    @Override
    public boolean ready() throws IOException {
      return input.ready();
    }

    @Override
    public void reset() throws IOException {
      input.reset();
    }
  }
895   
  /** Result of {@code newTokenizer}: the tokenizer, its description, and whether its offsets are trustworthy. */
  static class TokenizerSpec {
    Tokenizer tokenizer;
    // human-readable description of the tokenizer (class + ctor args)
    String toString;
    // false if this tokenizer is known to produce broken offsets
    boolean offsetsAreCorrect = true;
  }
901   
  /** Result of {@code newFilterChain}: the final stream, its description, and whether offsets are trustworthy. */
  static class TokenFilterSpec {
    TokenStream stream;
    // human-readable description of the filter chain (classes + ctor args)
    String toString;
    // false if any filter in the chain is known to produce broken offsets
    boolean offsetsAreCorrect = true;
  }
907   
  /** Result of {@code newCharFilterChain}: the (possibly wrapped) reader and a description of the chain. */
  static class CharFilterSpec {
    Reader reader;
    // human-readable description of the char filter chain (classes + ctor args)
    String toString;
  }
912   
913   public void testRandomChains() throws Throwable {
914     int numIterations = TEST_NIGHTLY ? atLeast(20) : 3;
915     Random random = random();
916     for (int i = 0; i < numIterations; i++) {
917       try (MockRandomAnalyzer a = new MockRandomAnalyzer(random.nextLong())) {
918         if (VERBOSE) {
919           System.out.println("Creating random analyzer:" + a);
920         }
921         try {
922           checkRandomData(random, a, 500*RANDOM_MULTIPLIER, 20, false,
923               false /* We already validate our own offsets... */);
924         } catch (Throwable e) {
925           System.err.println("Exception from random analyzer: " + a);
926           throw e;
927         }
928       }
929     }
930   }
931   
932   // we might regret this decision...
933   public void testRandomChainsWithLargeStrings() throws Throwable {
934     int numIterations = TEST_NIGHTLY ? atLeast(20) : 3;
935     Random random = random();
936     for (int i = 0; i < numIterations; i++) {
937       try (MockRandomAnalyzer a = new MockRandomAnalyzer(random.nextLong())) {
938         if (VERBOSE) {
939           System.out.println("Creating random analyzer:" + a);
940         }
941         try {
942           checkRandomData(random, a, 50*RANDOM_MULTIPLIER, 80, false,
943               false /* We already validate our own offsets... */);
944         } catch (Throwable e) {
945           System.err.println("Exception from random analyzer: " + a);
946           throw e;
947         }
948       }
949     }
950   }
951 }